Importing Libraries

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(rlang)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%()         masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ purrr::flatten()     masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke()      masks rlang::invoke()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ purrr::splice()      masks rlang::splice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(corrplot)
## corrplot 0.92 loaded

Loading the Dataset

# Read the CSV (path is relative to the working directory) into a base
# data.frame; read.csv rewrites the header names into syntactic R names.
Cleaned_bitcoin_mining <- read.csv("Cleaned_bitcoin_mining.csv")

# Preview the first rows to sanity-check column names and value scales.
head(Cleaned_bitcoin_mining)
##         Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## 1 2010-07-18T00:00:00      2.67e-05      2.24e-05        2.44e-05
## 2 2010-07-19T00:00:00      2.68e-05      2.26e-05        2.46e-05
## 3 2010-07-20T00:00:00      2.72e-05      2.29e-05        2.50e-05
## 4 2010-07-21T00:00:00      2.84e-05      2.39e-05        2.61e-05
## 5 2010-07-22T00:00:00      2.82e-05      2.37e-05        2.59e-05
## 6 2010-07-23T00:00:00      2.85e-05      2.40e-05        2.61e-05
##   annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 1                     0.000233717                     0.000196712
## 2                     0.000235075                     0.000197855
## 3                     0.000238699                     0.000200905
## 4                     0.000249343                     0.000209864
## 5                     0.000247305                     0.000208148
## 6                     0.000250023                     0.000210436
##   annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 1                       0.000214241                     14313700
## 2                       0.000215486                     14313700
## 3                       0.000218808                     14313700
## 4                       0.000228565                     14313700
## 5                       0.000226696                     14313700
## 6                       0.000229188                     14313700
##   Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## 1                   14313700                     14313700              4e-06
## 2                   14313700                     14313700              5e-06
## 3                   14313700                     14313700              5e-06
## 4                   14313700                     14313700              5e-06
## 5                   14313700                     14313700              5e-06
## 6                   14313700                     14313700              5e-06
##   Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 1          0.000119          0.000214                      554.1215
## 2          0.000119          0.000216                      554.1215
## 3          0.000121          0.000219                      554.1215
## 4          0.000127          0.000229                      554.1215
## 5          0.000126          0.000227                      554.1215
## 6          0.000127          0.000229                      554.1215
##   Hash.rate.MH.s
## 1    0.001606373
## 2    0.001822962
## 3    0.001822962
## 4    0.001750766
## 5    0.001669545
## 6    0.001669545

Checking the dimensions and structure of the data

# Number of rows and columns in the loaded data.
dim(Cleaned_bitcoin_mining)
## [1] 4815   15
# Per-column storage type plus the first few values of each column.
str(Cleaned_bitcoin_mining)
## 'data.frame':    4815 obs. of  15 variables:
##  $ Date.and.Time                    : chr  "2010-07-18T00:00:00" "2010-07-19T00:00:00" "2010-07-20T00:00:00" "2010-07-21T00:00:00" ...
##  $ power.MAX..GW                    : num  2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
##  $ power.MIN..GW                    : num  2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
##  $ power.GUESS..GW                  : num  2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
##  $ annualised.consumption.MAX..TWh  : num  0.000234 0.000235 0.000239 0.000249 0.000247 ...
##  $ annualised.consumption.MIN..TWh  : num  0.000197 0.000198 0.000201 0.00021 0.000208 ...
##  $ annualised.consumption.GUESS..TWh: num  0.000214 0.000215 0.000219 0.000229 0.000227 ...
##  $ Lower.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Estimated.efficiency..J.Th       : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Upper.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Hydro.only..MtCO2e               : num  4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
##  $ Estimated..MtCO2e                : num  0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
##  $ Coal.only..MtCO2e                : num  0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
##  $ Emission.intensity..gCO2e.kWh    : num  554 554 554 554 554 ...
##  $ Hash.rate.MH.s                   : num  0.00161 0.00182 0.00182 0.00175 0.00167 ...

Summary Statistics

# Five-number summary and mean for every numeric column; character columns
# are reported by length/class/mode only.
summary(Cleaned_bitcoin_mining)
##  Date.and.Time      power.MAX..GW      power.MIN..GW      power.GUESS..GW    
##  Length:4815        Min.   : 0.00003   Min.   :0.000022   Min.   : 0.000024  
##  Class :character   1st Qu.: 0.39179   1st Qu.:0.031152   1st Qu.: 0.154086  
##  Mode  :character   Median : 2.12457   Median :0.384142   Median : 0.905217  
##                     Mean   : 9.82974   Mean   :2.039373   Mean   : 3.989582  
##                     3rd Qu.:15.41883   3rd Qu.:4.049493   3rd Qu.: 7.710647  
##                     Max.   :56.01570   Max.   :8.947454   Max.   :15.063222  
##  annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
##  Min.   :  0.0002                Min.   : 0.0002                
##  1st Qu.:  3.4344                1st Qu.: 0.2731                
##  Median : 18.6240                Median : 3.3674                
##  Mean   : 86.1675                Mean   :17.8771                
##  3rd Qu.:135.1615                3rd Qu.:35.4978                
##  Max.   :491.0337                Max.   :78.4334                
##  annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
##  Min.   :  0.00021                 Min.   :      21            
##  1st Qu.:  1.35072                 1st Qu.:      38            
##  Median :  7.93513                 Median :      98            
##  Mean   : 34.97267                 Mean   :  458086            
##  3rd Qu.: 67.59153                 3rd Qu.:    9917            
##  Max.   :132.04420                 Max.   :14313700            
##  Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
##  Min.   :      31           Min.   :      46             Min.   :0.000004  
##  1st Qu.:      68           1st Qu.:     167             1st Qu.:0.028365  
##  Median :     261           Median :     766             Median :0.166638  
##  Mean   :  771891           Mean   : 1292594             Mean   :0.734426  
##  3rd Qu.:   36553           3rd Qu.:   75000             3rd Qu.:1.419422  
##  Max.   :14313700           Max.   :14313700             Max.   :2.772928  
##  Estimated..MtCO2e  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh
##  Min.   : 0.00012   Min.   :  0.00021   Min.   :359.5                
##  1st Qu.: 0.75628   1st Qu.:  1.35207   1st Qu.:512.8                
##  Median : 4.22858   Median :  7.94307   Median :533.7                
##  Mean   :17.95686   Mean   : 35.00765   Mean   :532.2                
##  3rd Qu.:31.96006   3rd Qu.: 67.65912   3rd Qu.:559.0                
##  Max.   :66.90830   Max.   :132.17625   Max.   :594.6                
##  Hash.rate.MH.s     
##  Min.   :        0  
##  1st Qu.:     3838  
##  Median :  3210303  
##  Mean   : 64397862  
##  3rd Qu.:111495251  
##  Max.   :506061817

From the summary statistics we can see the distribution and range of each variable. `summary()` would also report NA counts for any column with missing values; none are listed here.

Data cleaning

Checking for missing values

# Total number of NA cells across the whole data frame (all columns, all rows).
sum(is.na(Cleaned_bitcoin_mining))
## [1] 0

There are no missing values because this is the cleaned dataset; every column has complete data for all rows.

Checking number of Unique values

# Count the distinct values in every column. vapply() is used instead of
# sapply() so the result is guaranteed to be a named integer vector (one
# count per column) regardless of the input.
vapply(Cleaned_bitcoin_mining, function(x) length(unique(x)), integer(1))
##                     Date.and.Time                     power.MAX..GW 
##                              4815                              4767 
##                     power.MIN..GW                   power.GUESS..GW 
##                              4745                              4771 
##   annualised.consumption.MAX..TWh   annualised.consumption.MIN..TWh 
##                              4771                              4750 
## annualised.consumption.GUESS..TWh      Lower.bound.efficiency..J.Th 
##                              4774                                24 
##        Estimated.efficiency..J.Th      Upper.bound.efficiency..J.Th 
##                               275                                44 
##                Hydro.only..MtCO2e                 Estimated..MtCO2e 
##                              4543                              4757 
##                 Coal.only..MtCO2e     Emission.intensity..gCO2e.kWh 
##                              4761                                39 
##                    Hash.rate.MH.s 
##                              3801

Date.and.Time has 4815 unique values, which means that each row corresponds to a unique timestamp. Most of the columns have a large number of unique values, suggesting continuous data, but a few columns such as "Lower bound efficiency, J/Th", "Upper bound efficiency, J/Th", and "Emission intensity, gCO2e/kWh" have far fewer distinct values, indicating potential categories or repeated measurements.

Changing the data type of the "Date and Time" column

# Parse the ISO-8601 style strings (e.g. "2010-07-18T00:00:00") into POSIXct.
# The strings carry no zone designator, so the time zone is pinned explicitly:
# without `tz`, as.POSIXct() uses the machine's local zone, which makes the
# parsed instants machine-dependent and can produce NA on days where local
# midnight does not exist.
# NOTE(review): UTC is assumed for the source timestamps — confirm against the
# data provider.
Cleaned_bitcoin_mining$Date.and.Time <- as.POSIXct(
  Cleaned_bitcoin_mining$Date.and.Time,
  format = "%Y-%m-%dT%H:%M:%S",
  tz = "UTC"
)

 # Re-inspect the structure to confirm Date.and.Time is no longer character.
 str(Cleaned_bitcoin_mining)
## 'data.frame':    4815 obs. of  15 variables:
##  $ Date.and.Time                    : POSIXct, format: "2010-07-18" "2010-07-19" ...
##  $ power.MAX..GW                    : num  2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
##  $ power.MIN..GW                    : num  2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
##  $ power.GUESS..GW                  : num  2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
##  $ annualised.consumption.MAX..TWh  : num  0.000234 0.000235 0.000239 0.000249 0.000247 ...
##  $ annualised.consumption.MIN..TWh  : num  0.000197 0.000198 0.000201 0.00021 0.000208 ...
##  $ annualised.consumption.GUESS..TWh: num  0.000214 0.000215 0.000219 0.000229 0.000227 ...
##  $ Lower.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Estimated.efficiency..J.Th       : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Upper.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Hydro.only..MtCO2e               : num  4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
##  $ Estimated..MtCO2e                : num  0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
##  $ Coal.only..MtCO2e                : num  0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
##  $ Emission.intensity..gCO2e.kWh    : num  554 554 554 554 554 ...
##  $ Hash.rate.MH.s                   : num  0.00161 0.00182 0.00182 0.00175 0.00167 ...
 # Confirm the class of the converted column.
 class(Cleaned_bitcoin_mining$Date.and.Time)
## [1] "POSIXct" "POSIXt"
 # Earliest and latest timestamps covered by the data.
 date_range <- range(Cleaned_bitcoin_mining$Date.and.Time)
 
 date_range
## [1] "2010-07-18 EDT" "2023-09-22 EDT"

We convert the date-and-time column to POSIXct because many plotting functions understand POSIXct/POSIXt values and format axes and labels correctly when plotting date-times; it is also more convenient for date-based manipulation and arithmetic.

Univariate Analysis

# Columns examined in the univariate analysis (names as produced by read.csv).
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)

# Human-readable labels; matched to `variables` by position.
var_names <- c(
  "Power (GW)", "Annualised Consumption (TWh)", "Estimated Efficiency (J/Th)",
  "Hydro Only Emissions (MtCO2e)", "Estimated Emissions (MtCO2e)",
  "Coal Only Emissions (MtCO2e)", "Emission Intensity (gCO2e/kWh)",
  "Hash Rate (MH/s)"
)

# Reshape to long format (one row per observation/variable pair) so a single
# ggplot call can facet by variable.
df_long <- Cleaned_bitcoin_mining %>%
  select(all_of(variables)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

# Fix the facet order and swap the raw column names for the display labels.
df_long$Variable <- factor(df_long$Variable, levels = variables, labels = var_names)

# One histogram per variable with a frequency polygon overlaid.
# - after_stat(count) replaces the deprecated `..count..` notation.
# - linewidth replaces the deprecated `size` aesthetic for lines.
# - bins is set on geom_freqpoly too, so both layers use the same binning and
#   stat_bin() no longer emits its "using bins = 30" message.
p <- ggplot(df_long, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(count)),
    fill = "#66c2a5", color = "#004d40", bins = 30
  ) +
  geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
  facet_wrap(~ Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Histograms of Selected Variables", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the faceted histogram figure stored in `p`.
print(p)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Draw one full-size histogram per variable from the long-format data.
# seq_along() is used instead of 1:length() so an empty `variables` vector
# yields zero iterations rather than iterating over c(1, 0).
for (i in seq_along(variables)) {
  # Rows belonging to the i-th variable (df_long$Variable holds display labels).
  df_subset <- df_long[df_long$Variable == var_names[i], ]

  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`;
  # bins is set on both layers so they share one binning and stat_bin() stays
  # silent.
  p <- ggplot(df_subset, aes(x = Value)) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(title = paste("Histogram of", var_names[i]), y = "Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Explicit print() is required for plots created inside a loop.
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers

Boxplots to visualize outliers

# One box plot per variable; points beyond the whiskers are drawn in red.
for (i in seq_along(variables)) {
  # Map the column through the `.data` pronoun rather than indexing the data
  # frame inside aes(), which ggplot2 warns against.
  p <- ggplot(Cleaned_bitcoin_mining, aes(y = .data[[variables[i]]])) +
    geom_boxplot(
      fill = "#66c2a5", color = "#004d40",
      outlier.color = "red", outlier.size = 2
    ) +
    labs(title = paste("Box Plot of", var_names[i]), y = var_names[i]) +
    theme_minimal()

  # Explicit print() is required for plots created inside a loop.
  print(p)
}
## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

IQR

# Columns checked for outliers.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)

# Number of values per column lying outside the 1.5 * IQR fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# vapply() guarantees a named integer vector (names come from `variables`),
# so no separate names<- step is needed. The interquartile range is held in
# `iqr_width` so the base function stats::IQR is not shadowed.
outliers_counts <- vapply(variables, function(var) {
  values <- Cleaned_bitcoin_mining[[var]]
  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  iqr_width <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  # quantile() errors on NA input, so `values` has no NA here and sum() of the
  # logical mask is a plain count.
  sum(values < lower_bound | values > upper_bound)
}, integer(1))

outliers_counts
##                   power.GUESS..GW annualised.consumption.GUESS..TWh 
##                                 0                                 0 
##        Estimated.efficiency..J.Th                Hydro.only..MtCO2e 
##                              1097                                 0 
##                 Estimated..MtCO2e                 Coal.only..MtCO2e 
##                                 0                                 0 
##     Emission.intensity..gCO2e.kWh                    Hash.rate.MH.s 
##                               214                               254

Cap/Floor Outliers

# Work on a copy so the original data stay untouched.
Cleaned_bitcoin_mining_copy <- Cleaned_bitcoin_mining

# Clamp each selected column to its 1.5 * IQR fences: values below the lower
# fence are raised to it, values above the upper fence are lowered to it.
for (var in variables) {
  values <- Cleaned_bitcoin_mining_copy[[var]]
  quartiles <- quantile(values, c(0.25, 0.75), names = FALSE)
  # Named `iqr_width` so the base function stats::IQR is not shadowed.
  iqr_width <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  # pmax()/pmin() express the clamp directly instead of nested ifelse().
  Cleaned_bitcoin_mining_copy[[var]] <- pmin(pmax(values, lower_bound), upper_bound)
}

# Summary of the capped columns for comparison with the original summary.
summary(Cleaned_bitcoin_mining_copy[variables])
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000024   Min.   :  0.00021                
##  1st Qu.: 0.154086   1st Qu.:  1.35072                
##  Median : 0.905217   Median :  7.93513                
##  Mean   : 3.989582   Mean   : 34.97267                
##  3rd Qu.: 7.710647   3rd Qu.: 67.59153                
##  Max.   :15.063222   Max.   :132.04420                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :   31.13           Min.   :0.000004   Min.   : 0.00012  
##  1st Qu.:   67.72           1st Qu.:0.028365   1st Qu.: 0.75628  
##  Median :  260.92           Median :0.166638   Median : 4.22858  
##  Mean   :23180.17           Mean   :0.734426   Mean   :17.95686  
##  3rd Qu.:36553.00           3rd Qu.:1.419422   3rd Qu.:31.96006  
##  Max.   :91280.91           Max.   :2.772928   Max.   :66.90830  
##  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.00021   Min.   :443.5                 Min.   :        0  
##  1st Qu.:  1.35207   1st Qu.:512.8                 1st Qu.:     3838  
##  Median :  7.94307   Median :533.7                 Median :  3210303  
##  Mean   : 35.00765   Mean   :534.2                 Mean   : 60413666  
##  3rd Qu.: 67.65912   3rd Qu.:559.0                 3rd Qu.:111495251  
##  Max.   :132.17625   Max.   :594.6                 Max.   :278732371
# Histogram of every capped/floored column.
for (var in variables) {
  # aes_string() is deprecated; `.data[[var]]` maps a column given by name.
  # The x label is set explicitly so the axis still shows the column name.
  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`,
  # and bins is set on both layers so stat_bin() stays silent.
  p <- ggplot(Cleaned_bitcoin_mining_copy, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of", var, "after Capping/Flooring"),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Explicit print() is required for plots created inside a loop.
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Apply log transformation

# Add a log1p-transformed copy of every selected column, named "log_<column>".
# log1p(x) = log(1 + x), so zero values remain finite.
Cleaned_bitcoin_mining_log <- Cleaned_bitcoin_mining
for (var in variables) {
  Cleaned_bitcoin_mining_log[[paste0("log_", var)]] <- log1p(Cleaned_bitcoin_mining[[var]])
}

# Visualize the log-transformed data
for (var in paste0("log_", variables)) {
  # aes_string() is deprecated; `.data[[var]]` maps a column given by name.
  # The x label is set explicitly so the axis still shows the column name.
  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`,
  # and bins is set on both layers so stat_bin() stays silent.
  p <- ggplot(Cleaned_bitcoin_mining_log, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of Log Transformed", var),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Explicit print() is required for plots created inside a loop.
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Segmentation Analysis

Segmentation divides a dataset into subsets; here the two subsets are the original data (with outliers) and the data with outliers removed.

# Two segments: the untouched data, and a copy from which every row with an
# out-of-fence value in any selected column is dropped.
data_with_outliers <- Cleaned_bitcoin_mining
data_without_outliers <- Cleaned_bitcoin_mining

for (column in variables) {
  # Fences always come from the full original column, not the shrinking subset.
  first_quartile <- quantile(Cleaned_bitcoin_mining[[column]], 0.25)
  third_quartile <- quantile(Cleaned_bitcoin_mining[[column]], 0.75)
  quartile_gap <- third_quartile - first_quartile

  fence_low <- first_quartile - 1.5 * quartile_gap
  fence_high <- third_quartile + 1.5 * quartile_gap

  # Keep only the rows of the current subset that sit inside both fences.
  current_values <- data_without_outliers[[column]]
  inside_fences <- current_values >= fence_low & current_values <= fence_high
  data_without_outliers <- data_without_outliers[inside_fences, ]
}

summary_with_outliers <- summary(data_with_outliers[variables])
summary_without_outliers <- summary(data_without_outliers[variables])

# Show both summaries side by side in one list.
list(Without_Outliers = summary_without_outliers, With_Outliers = summary_with_outliers)
## $Without_Outliers
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000478   Min.   :  0.00419                
##  1st Qu.: 0.539391   1st Qu.:  4.72830                
##  Median : 3.611244   Median : 31.65617                
##  Mean   : 4.398428   Mean   : 38.55662                
##  3rd Qu.: 8.442700   3rd Qu.: 74.00870                
##  Max.   :13.266792   Max.   :116.29670                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :   33.43           Min.   :0.000088   Min.   : 0.00236  
##  1st Qu.:   68.34           1st Qu.:0.099294   1st Qu.: 2.57071  
##  Median :  182.88           Median :0.664779   Median :16.67565  
##  Mean   : 3524.93           Mean   :0.809689   Mean   :20.32547  
##  3rd Qu.:  850.32           3rd Qu.:1.554183   3rd Qu.:38.74943  
##  Max.   :58750.00           Max.   :2.442231   Max.   :64.73054  
##  Coal.only..MtCO2e  Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.0042   Min.   :462.5                 Min.   :        7  
##  1st Qu.:  4.7330   1st Qu.:512.9                 1st Qu.:   450189  
##  Median : 31.6878   Median :533.7                 Median : 15023580  
##  Mean   : 38.5952   Mean   :533.9                 Mean   : 60112171  
##  3rd Qu.: 74.0827   3rd Qu.:554.5                 3rd Qu.:112650400  
##  Max.   :116.4130   Max.   :594.6                 Max.   :277924882  
## 
## $With_Outliers
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000024   Min.   :  0.00021                
##  1st Qu.: 0.154086   1st Qu.:  1.35072                
##  Median : 0.905217   Median :  7.93513                
##  Mean   : 3.989582   Mean   : 34.97267                
##  3rd Qu.: 7.710647   3rd Qu.: 67.59153                
##  Max.   :15.063222   Max.   :132.04420                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :      31           Min.   :0.000004   Min.   : 0.00012  
##  1st Qu.:      68           1st Qu.:0.028365   1st Qu.: 0.75628  
##  Median :     261           Median :0.166638   Median : 4.22858  
##  Mean   :  771891           Mean   :0.734426   Mean   :17.95686  
##  3rd Qu.:   36553           3rd Qu.:1.419422   3rd Qu.:31.96006  
##  Max.   :14313700           Max.   :2.772928   Max.   :66.90830  
##  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.00021   Min.   :359.5                 Min.   :        0  
##  1st Qu.:  1.35207   1st Qu.:512.8                 1st Qu.:     3838  
##  Median :  7.94307   Median :533.7                 Median :  3210303  
##  Mean   : 35.00765   Mean   :532.2                 Mean   : 64397862  
##  3rd Qu.: 67.65912   3rd Qu.:559.0                 3rd Qu.:111495251  
##  Max.   :132.17625   Max.   :594.6                 Max.   :506061817

Bivariate Analysis

Correlation Matrix

# Pairwise correlations between the selected columns, computed on rows that
# are complete across all of them.
cor_matrix <- Cleaned_bitcoin_mining[variables] %>%
  cor(use = "complete.obs")

# Diverging palette generator running from red through white to blue.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)

# Upper-triangle colour tiles, variables ordered by hierarchical clustering,
# with the coefficient printed in each cell.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  col = col(200),
  addCoef.col = "black",
  number.cex = 0.5,
  tl.col = "black",
  tl.srt = 90,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)
)

Highly Correlated Variables :

Two-sample t-test comparing power.GUESS..GW before and after 3 January 2013 (the cutoff date used in the code below)

# Split the data at 2013-01-03 and compare mean estimated power between the
# two periods.
#
# The cutoff is built as POSIXct in the same time zone as Date.and.Time.
# Comparing a POSIXct column against a Date mixes two classes with different
# comparison methods, and the outcome then depends on how (and in which zone)
# the Date is interpreted.
split_tz <- attr(Cleaned_bitcoin_mining$Date.and.Time, "tzone")
if (is.null(split_tz)) {
  split_tz <- ""  # "" means the session's local time zone
}
split_cutoff <- as.POSIXct("2013-01-03", tz = split_tz[1])

before_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time < split_cutoff)
after_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time >= split_cutoff)

# t.test() defaults to the Welch test (unequal variances), two-sided.
t_result <- t.test(before_2013$power.GUESS..GW, after_2013$power.GUESS..GW)

print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  before_2013$power.GUESS..GW and after_2013$power.GUESS..GW
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.036382 -4.749997
## sample estimates:
##  mean of x  mean of y 
## 0.01100724 4.90419668

T-test for Selected Variables

# Compare one column between the two periods. Returns an explanatory string
# when the test cannot be run sensibly, otherwise the t.test() result.
run_period_test <- function(var) {
  if (anyNA(before_2013[[var]]) || anyNA(after_2013[[var]])) {
    return("Contains NA values")
  }

  distinct_before <- length(unique(before_2013[[var]]))
  distinct_after <- length(unique(after_2013[[var]]))
  if (distinct_before == 1 || distinct_after == 1) {
    return("Constant values in one or both periods")
  }

  # Arguments are written out in full so the "data:" line of the printed
  # result names the compared columns.
  t.test(before_2013[[var]], after_2013[[var]])
}

# One entry per selected column, keyed by column name.
results <- setNames(lapply(variables, run_period_test), variables)

# Print every result under its own header, separated by a rule.
for (var in variables) {
  cat(sprintf("T-test results for %s :\n", var))
  print(results[[var]])
  cat("\n---------------------------------------------\n")
}
## T-test results for power.GUESS..GW :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.036382 -4.749997
## sample estimates:
##  mean of x  mean of y 
## 0.01100724 4.90419668 
## 
## 
## ---------------------------------------------
## T-test results for annualised.consumption.GUESS..TWh :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -44.14892 -41.63848
## sample estimates:
##   mean of x   mean of y 
##  0.09648949 42.99018811 
## 
## 
## ---------------------------------------------
## T-test results for Estimated.efficiency..J.Th :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = 25.546, df = 899.31, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3659260 4268296
## sample estimates:
##  mean of x  mean of y 
## 3994776.19   30998.28 
## 
## 
## ---------------------------------------------
## T-test results for Hydro.only..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9271274 -0.8744080
## sample estimates:
##   mean of x   mean of y 
## 0.002026276 0.902793953 
## 
## 
## ---------------------------------------------
## T-test results for Estimated..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.447, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22.66799 -21.36865
## sample estimates:
##   mean of x   mean of y 
##  0.05411622 22.07243658 
## 
## 
## ---------------------------------------------
## T-test results for Coal.only..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -44.19307 -41.68011
## sample estimates:
##   mean of x   mean of y 
##  0.09658598 43.03317831 
## 
## 
## ---------------------------------------------
## T-test results for Emission.intensity..gCO2e.kWh :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = 53.295, df = 4099.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32.17905 34.63700
## sample estimates:
## mean of x mean of y 
##  559.4096  526.0015 
## 
## 
## ---------------------------------------------
## T-test results for Hash.rate.MH.s :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -47.902, df = 3914, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -82443618 -75960296
## sample estimates:
##    mean of x    mean of y 
## 9.051497e+00 7.920197e+07 
## 
## 
## ---------------------------------------------